<<<<<<< HEAD specificity_paper_main

load gtex here

## import gtex medians data
temp <- tempfile()
download.file("https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",temp)
gtex <- read.table( temp, skip=2, header = TRUE, sep = "\t")
gtex_rowinfo <- data.frame(Name=gtex$Name, Description=gtex$Description)
rownames(gtex) <- gtex$Name
gtex <- as.matrix(gtex[,3:ncol(gtex)])
unlink(temp); rm(temp)


## import ensembl gene data
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
genes <- getBM(attributes=c('chromosome_name','start_position','end_position','hgnc_symbol', 'ensembl_gene_id','gene_biotype'),
                 filters = list('biotype'='protein_coding'),
                 mart = ensembl, useCache = F)
genes <- genes[which(is.element(genes$chromosome_name, c(1:22, "X", "Y", "MT")) & genes$hgnc_symbol != "" ) ,]

clean gtex dataset here

## show mitochondrial genes drive a large part of sample similarity
## Note sum of medians in gtex not quite 1M - likely artifact of taking medians

temp <- data.frame(names=colnames(gtex), total_transcripts=colSums(gtex))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


## match genes between gtex and ensembl
gtex_names <- str_split_fixed(gtex_rowinfo$Name, "[.]", 2)[,1]
which <- which(genes$chromosome_name != "MT" )
gtex_cleaned <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[which])),]
which <- which(genes$chromosome_name == "MT" )
gtex_cleanedMT <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[which])),]

##non-mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


##mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleanedMT), total_transcripts=colSums(gtex_cleanedMT))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


rm(gtex_cleanedMT)
##renormalize TPM without mitochondrial genes
for(i in 1:ncol(gtex_cleaned)){
  gtex_cleaned[,i] <- (gtex_cleaned[,i]*1e6 / sum(gtex_cleaned[,i]))
}

temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


gtex <- gtex_cleaned; rm(gtex_cleaned)
exp_mat <- gtex
## log10(TPM+1) transform of data
exp_mat <- log10(exp_mat+1)

## remove mitochondrial contribution
grid.arrange(grobs=grobl_supp, nrow=4, heights=c(5,3,3,3))

## get gene ids (rows from df_1b) that correspond to given GO id/ GO label
## generate small set of brain related GO id/ GO labels
## plot on one v all plot

genes <- getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id','go_id','name_1006'),
               mart = ensembl, useCache = F)
#genes <- load("genes_w_go.RData")

allOE_genes<-str_split_fixed(rownames(exp_mat),"[.]",2)[,1]
ggl <- list()
for(measure in specificity_measures$func_names){
  temp_df <- df_1b[which( df_1b$measure == measure),]
  temp_df_sub <- slice_max(temp_df, order_by=delta, n=nrow(temp_df)/100 )
  sigOE_genes <- str_split_fixed(temp_df_sub$Var1,"[.]",2)[,1]
  ego <- enrichGO(gene = sigOE_genes,
                  universe = allOE_genes,
                  keyType = "ENSEMBL",
                  OrgDb = org.Hs.eg.db,
                  ont = "BP",
                  pAdjustMethod = "BH",
                  qvalueCutoff = 0.05,
                  readable = TRUE,
                  pool=TRUE)

  ggl[[measure]] <- ggplot(temp_df, aes(x=all, y=one))+
    geom_point(size=0.1)+
    geom_point(data=temp_df_sub, aes( x=all, y=one), color="red")+
    ggtitle(measure)+
    geom_abline(slope=1,intercept=0, color="red", size=0.3)+
    theme_bw()+
      theme(panel.grid.minor.x = element_blank())

  n=10
  filename <- paste(figures_dir,"/",measure,"_top1percent_deltas_go.png",sep = "")
  ego <- enrichplot::pairwise_termsim(ego)
  p1 <- ggl[[measure]]
  p2 <- enrichplot::dotplot(ego, showCategory = n, font.size=8, label_format=15)+
          theme(legend.position = "none")

  p3 <- emapplot(ego, showCategory = n, cex_label_category=0.5, cex_category=0.5, cex_line=0.5 )+
    scale_y_discrete(labels=str_wrap(ego@result$Description[1:n], width=15 ))


  grid.arrange(p1, p2,p3, ncol=3) #, widths = c(3,3,4))
  #dev.off()

}
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 25284 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).

## dot similarity from initial Z scores
dot_sim <- similarity_func(exp_mat)
sim_tree <- cluster_func(dot_sim)
heatmap(dot_sim, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13), symm=T)

sim_tree <- cluster_func(dot_sim)
## plot similarity tree with weights
par(mar=c(2,2,2,15))
sim_tree %>%
  dendextend::set("nodes_pch",19) %>%
  dendextend::set("nodes_cex",2* sqrt(get_nodes_attr(sim_tree,"weight"))) %>%
  plot(horiz=T)

## take a look at the weights
temp <- get_weights(sim_tree, get_leaves_attr(sim_tree, "label"))
temp <- data.frame(weights=temp, names=factor(names(temp), levels=rev(names(temp)) ) )
g <- ggplot( temp ,aes(y=fct_rev(as.factor(names)), x=weights))+
  geom_col() #+
  #theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))
g

weights <- get_weights(sim_tree, colnames(exp_mat))
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[1:length(flat)] <- 1
spec_mat_flat  <- calc_weighted_zscore_matrix(exp_mat, flat)
## plot to look at global dif between flat and weighted
tau_flat <- calc_weighted_tau(exp_mat, flat)
tau_weighted <- calc_weighted_tau(exp_mat, weights)
tsi_flat <- calc_weighted_tsi(exp_mat,flat)
tsi_weighted <- calc_weighted_tsi(exp_mat,weights)
gini_flat <- calc_weighted_gini(exp_mat, flat)
gini_weighted <- calc_weighted_gini(exp_mat, weights)

hist(spec_mat_flat ,breaks=100)
hist(spec_mat_weighted ,breaks=100)
hist(tau_flat ,breaks=100)
hist(tau_weighted  ,breaks=100)
hist(tsi_flat ,breaks=100)
hist(tsi_weighted ,breaks=100)
hist(gini_flat ,breaks=100)
hist(gini_weighted ,breaks=100)
hist(spec_mat_weighted - spec_mat_flat ,breaks=100)
hist(tau_weighted - tau_flat ,breaks=100)
hist(tsi_weighted - tsi_flat ,breaks=100)
hist(gini_weighted - gini_flat ,breaks=100)

## load data instead
if(TRUE){  ## NOTE! This chunk can require a lot of memory and time depending if running sequential. If you want to look at robustness test results it is recommended you have enabled parallel computing (e.g. check getDoParWorkers > 3 or 4 (preferably more))

  ## Use this variable to control whether performing random or true brain-non-brain partition
  random_partition = FALSE


  robustness_test_results <- list( )
  for(i in 1:length(specificity_measures$func_names)){
    el_names <- paste( c( "P1_", "P2_")
                       , specificity_measures$func_names[i], sep="")
    temp_list <- setNames(list(list(), list()), nm=el_names  )
    robustness_test_results <- c(robustness_test_results, temp_list )
  }


  num_brain_samples <- length(colnames(exp_mat)[grep("[Bb]rain",colnames(exp_mat))])
  num_reps <- 3
  ## use general similarity_func and cluster_func in case we want to test more later
  similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}
  cluster_func <- function(sim_mat){add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = "single") ) ))}


  ## to switch analysis between True Brain Partition and equalivalent sized Random Partition set random_partition ##
  if(random_partition){
    which <- sample(1:ncol(exp_mat), length(grep("[Bb]rain",colnames(exp_mat),invert = TRUE)))
    P1 <- as.matrix(exp_mat[, which])
    notP1 <- as.matrix(exp_mat[, which(!is.element(1:ncol(exp_mat),which ))])
  }else{
    ## P1 and notP1 are contant throughout, so set these outside loops
    P1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = TRUE)])
    notP1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = FALSE)])
  }
  ### define 1-n trajectories ahead of time since i+1 should be referenced against i.
  ## do foreach over each element of permuation matrix ## use arrayInd to get row/col given perm number
  permutation_matrix <- matrix(nrow = num_brain_samples, ncol = num_reps )
  for(rep in 1:num_reps){
    permutation_matrix[,rep] <- sample(colnames(notP1),num_brain_samples, replace = FALSE)
  }
  if(!random_partition){
    load("final_permutation_matrix") ## reproduce same used in paper figures
  }
  if(random_partition){
    load("final_random_permutation_matrix") ## reproduce same used in paper figures
  }


  for(measure in specificity_measures$func_names){
    specificity_func <- specificity_measures$funcs[[measure]]
    if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}

      results <- foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
      library(Hmisc)
      library(dendextend)
      library(reldist)
      sample_indices <- 1:(arrayInd(perm, dim(permutation_matrix))[,1]) ## row is sample name, all names in a row up to top are choices for n=1,2..i
      rep_index <- arrayInd(perm, dim(permutation_matrix))[,2] ## column is replicate number
      sample_set <- permutation_matrix[sample_indices, rep_index]

      # P1_baseline is just P1 at n=1
      P2 <- notP1[,sample_set, drop=F]; colnames(P2) <- make.unique(colnames(P2)) ## make.unique will facilitate sampling with replacement
      P1uP2 <- cbind(P1,P2)

      ## P1uP2 ## exp_mat > get sim mat > get sim tree > get samp weights > get spec mat
      sim_mat <- similarity_func(P1uP2)
      sim_tree <- cluster_func(sim_mat)
      weights <- get_weights(sim_tree, colnames(P1uP2))
      flat <- weights; flat[] <- 1


      specificity_flat <- specificity_func(P1uP2, flat)
      specificity_weighted <- specificity_func(P1uP2, weights)

      # need to fill in these with spec_score matrices for each n for each method
      # names should match
      ### restore once error fixed
      if(specificity_measures$out_type[[measure]]=="matrix"){
        result <- list(
          P1_weighted_spec_score = specificity_weighted[,colnames(P1), drop=F],
          P1_flat_spec_score = specificity_flat[,colnames(P1), drop=F],
          P2_weighted_spec_score = specificity_weighted[,colnames(P2), drop=F],
          P2_flat_spec_score = specificity_flat[,colnames(P2), drop=F]
        )
      }else if(specificity_measures$out_type[[measure]]=="vector"){
        result <- list(P1_weighted_spec_score = specificity_weighted,
                        P1_flat_spec_score = specificity_flat,
                        P2_weighted_spec_score = specificity_weighted,
                        P2_flat_spec_score = specificity_flat
                        )
      }

      }

      results_P1 <- list()
      results_P2 <- list()
      for(i in 1:length(results)){
        name <- names(results)[i]
        rep <- str_split_fixed(names(results)[1], "[=,]", n=4)[,2]
        n <- str_split_fixed(names(results)[1], "[=,]", n=4)[,4]
        results_P1[[name]] <- list(P1_weighted_spec_score = results[[name]]$P1_weighted_spec_score, P1_flat_spec_score = results[[name]]$P1_flat_spec_score)
        results[[name]]$P1_weighted_spec_score <- NULL;  results[[name]]$P1_flat_spec_score <- NULL;  ## large object so delete as we go to avoid copy

        results_P2[[name]] <- list(P2_weighted_spec_score = results[[name]]$P2_weighted_spec_score, P2_flat_spec_score = results[[name]]$P2_flat_spec_score)
        results[[name]]$P2_weighted_spec_score <- NULL;  results[[name]]$P2_flat_spec_score<- NULL;  ## large object so delete as we go to avoid copy
      }

      robustness_test_results[[paste("P1_", measure,sep = "")]] <- results_P1; rm(results_P1)
      robustness_test_results[[paste("P2_", measure,sep = "")]] <- results_P2; rm(results_P2)
  }


} ## end of chunk control
## output to look at is robustness_test_results
#save.image()

df_robustness_test_results <- data.frame(measure=character(),
                                         P1orP2=character(),
                                         weight_or_flat=character(),
                                         n=numeric(),
                                         rep=numeric(),
                                         variance=numeric()
                                         )

for(el in 1:length(robustness_test_results)){
  if(length(robustness_test_results[[el]])==0){next;}
  temp_name <- str_split_fixed(names(robustness_test_results[el]),"_",2)

  temp_len <- length(robustness_test_results[[el]]) ## each has weighted and flat
  temp_df <- data.frame(measure=rep(temp_name[2],temp_len),
                        P1orP2=rep(temp_name[1],temp_len),
                        weight_or_flat=character(temp_len),
                        n=numeric(temp_len),
                        rep=numeric(temp_len),
                        variance=numeric(temp_len))
  temp_df_list <- list("weighted"=temp_df, "flat"=temp_df)
  temp_df_list[["weighted"]]$weight_or_flat <- "weighted"
  temp_df_list[["flat"]]$weight_or_flat <- "flat"

  ## build the temp_dfs in these loops
  for(i in 1:length(robustness_test_results[[el]])){
    for(f_or_w in c("flat","weighted")){
      temp_name <- str_split_fixed(names(robustness_test_results[[el]][i]),"[,=]",4)
      temp_df_list[[f_or_w]]$rep[i] <- as.numeric(temp_name[,grep("rep", temp_name)+1])
      temp_df_list[[f_or_w]]$n[i] <- as.numeric(temp_name[,grep("n", temp_name)+1])
      temp <- robustness_test_results[[el]][[i]]
      temp <- temp[[grep(f_or_w, names(temp))]]

      ## baseline matched on replicate, and is set at n=1
      temp_baseline <- robustness_test_results[[el]][[paste("rep=",temp_name[,grep("rep", temp_name)+1],",n=1",sep="")]]
      temp_baseline <- temp_baseline[[grep(f_or_w, names(temp_baseline))]]

      temp_df_list[[f_or_w]]$variance[i] <- var(as.vector(as.matrix(temp)) - as.vector(as.matrix(temp_baseline)), na.rm=T)
    }
  }
  ## compile temp_dfs into final result
  df_robustness_test_results <- rbind(df_robustness_test_results, temp_df_list[["weighted"]], temp_df_list[["flat"]])

}

df_2c <- df_robustness_test_results
df_2c_summary_by_partition <- aggregate(df_2c$variance,
                                     by=list(measure=df_2c$measure,
                                             P1orP2=df_2c$P1orP2,
                                             weight_or_flat= df_2c$weight_or_flat,
                                             n=df_2c$n),
                                     FUN=function(x) mean(x,na.rm=TRUE))
names(df_2c_summary_by_partition)[ncol(df_2c_summary_by_partition)] <- "mean_var"
df_2c_summary_by_partition$sd_of_var <- aggregate(df_2c$variance,
                                     by=list(measure=df_2c$measure,
                                             P1orP2=df_2c$P1orP2,
                                             weight_or_flat= df_2c$weight_or_flat,
                                             n=df_2c$n),
                                     FUN=function(x) sd(x,na.rm=TRUE))$x


temp <- df_2c_summary_by_partition
df_2c_summary_by_partition$y_min <-  temp$mean_var-temp$sd_of_var/sqrt(num_reps)
df_2c_summary_by_partition$y_max <-  temp$mean_var+temp$sd_of_var/sqrt(num_reps)


ggl <- list()
## Zscore first
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2c_summary_by_partition$measure == measure)
    which2 <- which(df_robustness_test_results$measure == measure)
  } else {
    which <- which(df_2c_summary_by_partition$measure != "Zscore")
    which2 <- which(df_robustness_test_results$measure != "Zscore")
  }

  gg <- ggplot(df_2c_summary_by_partition[which,], aes(x=n, y=mean_var)) +
    scale_color_manual(values=c("grey20","steelblue3"))+
    scale_fill_manual(values=c("grey20","steelblue3"))+
    geom_ribbon(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, fill=weight_or_flat),alpha=0.5)+
    geom_line(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, color=weight_or_flat, fill=weight_or_flat))+
    geom_point(data = df_robustness_test_results[which2,], size=0.2,
               aes(x=n, y=variance,group=weight_or_flat,color=weight_or_flat, fill=weight_or_flat))+
    theme_bw()+
    theme(panel.grid.minor.x = element_blank())+
    scale_x_continuous(limits = c(0,13), breaks=seq(0,12,by=1), expand = c(0,0))

  if(measure == "Zscore"){
     gg <- gg+facet_wrap(measure ~ P1orP2 , scales = "free",
                         labeller=labeller(P1orP2=c("P1"="P1 (non-brain)", "P2"="P2 (brain)")))
     gg <- gg+theme(legend.position = "none")
  } else {
     gg <- gg+facet_wrap( ~ measure, ncol = 3)
     gg <- gg+theme(legend.position="bottom")
  }
  ggl[[measure]] <- gg
}
#> Warning: Ignoring unknown aesthetics: ymin, ymax, fill

#> Warning: Ignoring unknown aesthetics: ymin, ymax, fill

grobl <- list()
for(n in names(ggl)){
  grobl[[n]] <- ggplotGrob(ggl[[n]])
}
grid.arrange(grobs=grobl, nrow=2)

### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
which_list <- list()
which_list[["Z_P1_flat"]] <- which(df_2c$measure=="Zscore"
                    & df_2c$P1orP2=="P1"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="flat")
which_list[["Z_P1_weighted"]] <- which(df_2c$measure=="Zscore"
                    & df_2c$P1orP2=="P1"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="weighted")
which_list[["Z_P2_flat"]] <- which(df_2c$measure=="Zscore"
                    & df_2c$P1orP2=="P2"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="flat")
which_list[["Z_P2_weighted"]] <- which(df_2c$measure=="Zscore"
                    & df_2c$P1orP2=="P2"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="weighted")


print("t-test P1 weighted / P1 flat : %diff between weighted and flat")
#> [1] "t-test P1 weighted / P1 flat : %diff between weighted and flat"
## t.test on logs gives ratio estimates
temp <- t.test(x=log10(df_2c[which_list$Z_P1_weighted,]$variance),y=log10(df_2c[which_list$Z_P1_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
##  antilog to recover ratio of weighted / flat
1-10^temp$estimate
#> mean of the differences
#>               0.7932209
1-10^temp$conf.int
#> [1] 0.8021299 0.7839107
#> attr(,"conf.level")
#> [1] 0.95
### 79.3% lower variance in weighted than flat (CI)

print("t-test P2 weighted / P2 flat : %diff between weighted and flat")
#> [1] "t-test P2 weighted / P2 flat : %diff between weighted and flat"
temp <- t.test(x=log10(df_2c[which_list$Z_P2_weighted,]$variance),y=log10(df_2c[which_list$Z_P2_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
1-10^temp$estimate
#> mean of the differences
#>               0.3578047
1-10^temp$conf.int
#> [1] 0.4196920 0.2893173
#> attr(,"conf.level")
#> [1] 0.95
### 

## jaccard between n = 1 and n = num_brain_samples for brain v non-brain OR P1 v P2

df_2d <- data.frame(measure=character(), n=numeric(),
                 rep=numeric(), cut_point=numeric(),
                 jaccard=numeric(), P1orP2=character(),
                 weighted=logical(), samp=character(),
                 gene_count=numeric(), gene_count_baseline=numeric(),
                 gene_count_intersect=numeric())
## nested for loop builds df of jaccard results for plotting
Sys.time()
#> [1] "2022-01-02 13:47:37 PST"
for(el in names(robustness_test_results)){
  if(length(robustness_test_results[[el]])==0 ){next;}
  P1orP2 <- str_split_fixed(el,"_",n=2)[1]
  measure <- str_split_fixed(el,"_",n=2)[2]
  if(measure == "Zscore"){cuts <- seq(0,4,0.5)   ## set cutoffs
  }else{ cuts <- seq(0,1,0.05)}

 # for(i in 1:length(robustness_test_results[[el]]) ){   ### foreach at this level if implementing foreach
  df_temp <- foreach(i=1:length(robustness_test_results[[el]]), .errorhandling = "pass", .combine = rbind) %dopar% {
    #foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
    library(stringr)
    df_internal <- data.frame(measure=character(), n=numeric(),
                              rep=numeric(), cut_point=numeric(),
                              jaccard=numeric(), P1orP2=character(),
                              weighted=logical(), samp=character(),
                              gene_count=numeric(), gene_count_baseline=numeric(),
                              gene_count_intersect=numeric())

    n = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,4])
    rep = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,2])
    for(weighted_or_flat in c("weighted","flat")){
      baseline_name <- names(robustness_test_results[[el]])[grep(paste("rep=",rep,",n=",1,"$", sep = ""),names(robustness_test_results[[el]]) )]

      if(specificity_measures$out_type[measure] == "matrix"){
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
        baseline_mat <-  robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]]
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
        temp_mat <- robustness_test_results[[el]][[i]][[weighted_flat_index]][,colnames(baseline_mat),drop=FALSE]
      }else if(specificity_measures$out_type[measure] == "vector"){
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
        baseline_mat <-  as.matrix(robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]])
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
        temp_mat <- as.matrix(robustness_test_results[[el]][[i]][[weighted_flat_index]])
      }else{ stop("WARNING: no out type associated with measure") }

      ncols <- ncol(temp_mat); ncuts <- length(cuts)
      temp_df <- data.frame(measure=rep(measure, ncols*ncuts), n=rep(n, ncols*ncuts),
                            rep=rep(rep, ncols*ncuts), cut_point=numeric(length=ncols*ncuts),
                            jaccard=numeric(length=ncols*ncuts), P1orP2=rep(P1orP2, ncols*ncuts),
                            weighted=rep( grepl("weighted", weighted_or_flat), ncols*ncuts),
                            samp=character(length=ncols*ncuts),
                            gene_count=numeric(length=ncols*ncuts), gene_count_baseline=numeric(length=ncols*ncuts),
                            gene_count_intersect=numeric(length=ncols*ncuts)  )
      for(s in 1:ncol(temp_mat) ){  ## add a row to df for each sample
        for(cut_index in 1:length(cuts)){
          ind <- (s-1)*length(cuts)+cut_index  ### use arr_ind here instead
          temp_df[ind,]$cut_point <- cuts[cut_index]
          if(cuts[cut_index]<0){
            temp_df[ind,]$jaccard <- (  length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index]  )) /
                                        length( which( baseline_mat[,s] <= cuts[cut_index] | temp_mat[,s] <= cuts[cut_index]  )) )
            temp_df[ind,]$gene_count <-  length( which(temp_mat[,s] <= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] <= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_intersect <-  length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index]  ))
          } else {
            temp_df[ind,]$jaccard <- (  length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index]  )) /
                                        length( which( baseline_mat[,s] >= cuts[cut_index] | temp_mat[,s] >= cuts[cut_index]  )) )
            temp_df[ind,]$gene_count <-  length( which(temp_mat[,s] >= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_baseline <-  length( which(baseline_mat[,s] >= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_intersect <-  length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index]  ))
          }
          temp_df[ind,]$samp <- ifelse(!is.null((colnames(baseline_mat[,s,drop=FALSE] ) )),
                                       unique(colnames(baseline_mat[,s,drop=FALSE] ),
                                              colnames(temp_mat[,s,drop=FALSE] ) ), NA )
        }
      }
      df_internal <- rbind(df_internal, temp_df)
    }
    df_internal
  }
  df_2d <- rbind(df_2d, df_temp)
}
Sys.time()
#> [1] "2022-01-02 14:24:56 PST"


df_2d_summary_by_partition <-aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                    FUN=function(x) mean(x,na.rm=TRUE) )
names(df_2d_summary_by_partition)[length(names(df_2d_summary_by_partition))] <- "jaccard_mean"
df_2d_summary_by_partition$jaccard_sd <- aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted) ,
                                                FUN=function(x) sd(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count <- aggregate(df_2d$gene_count,  by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted)
                                                , FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_baseline <- aggregate(df_2d$gene_count_baseline, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                                        FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_intersect <- aggregate(df_2d$gene_count_intersect, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                                        FUN=function(x) mean(x,na.rm = TRUE) )$x



ggl_jacc <- list()
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    which <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  gg <- ggplot(df_2d_summary_by_partition[which,],
               aes(x=cut_point, y=jaccard_mean,
                   ymin=jaccard_mean-jaccard_sd/num_reps,
                   ymax=jaccard_mean+jaccard_sd/num_reps,
                   group=n, fill=n))+
    theme_bw()+
    theme(
          panel.grid.minor.x = element_blank())+
    #ggtitle(paste("jaccard index for ", measure, " P1 and  P2, weighted and flat measures",sep=""))+
    geom_line(aes(color=n))+
    scale_color_continuous(low="powderblue", high="steelblue4")+
    scale_y_continuous(limits = c(0,1), breaks=seq(0,1,by=0.2))+
    facet_grid(P1orP2 ~ weighted, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1),  breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }

  ggl_jacc[[measure]] <- gg
}

grobl <- list()
for(n in names(ggl_jacc)){
  grobl[[n]] <- ggplotGrob(ggl_jacc[[n]])
}
grid.arrange(grobs =list(grobl$Zscore,grobl$notZscore), nrow=2)


## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}



ggl_count <- list()
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    which <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  gg <- ggplot(df_2d_summary_by_partition[which,],
               aes(x=cut_point, y=gene_count / nrow(genes),
                   group=n, fill=n))+
    theme_bw()+
    theme(
          panel.grid.minor.x = element_blank())+
    geom_line(aes(color=n))+
    scale_color_continuous(low="steelblue2", high="steelblue4")+
    #scale_y_log10( #limits = c(1,3e4),
    scale_y_continuous( limits=c(1e-4,1) , trans = log10_trans(),
                       breaks = trans_breaks("log10", function(x) 10^x), # ) #,
                       labels = percent,  )

  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1),  breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }
  ggl_count[[measure]] <- gg
}
grobl <- list()
for(n in names(ggl_count)){
  grobl[[n]] <- ggplotGrob(ggl_count[[n]])
}
#> Warning: Transformation introduced infinite values in continuous y-axis
grid.arrange(grobs =  list(grobl$Zscore, grobl$notZscore), nrow=2)


## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
which_list <- list()
which_list[["Z_P1_flat"]] <- which(df_2d$measure=="Zscore"
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P1"
                                   & df_2d$weighted==FALSE)
which_list[["Z_P1_weighted"]] <- which(df_2d$measure=="Zscore"
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P1"
                                   & df_2d$weighted==TRUE)

which_list[["Z_P2_flat"]] <- which(df_2d$measure=="Zscore"
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P2"
                                   & df_2d$weighted==FALSE)
which_list[["Z_P2_weighted"]] <- which(df_2d$measure=="Zscore"
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P2"
                                   & df_2d$weighted==TRUE)



print("t-test P2 flat jaccard")
#> [1] "t-test P2 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.2513928
temp$conf.int
#> [1] 0.1101765 0.3926090
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P2 weighted jaccard")
#> [1] "t-test P2 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6359852
temp$conf.int
#> [1] 0.5740172 0.6979532
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P1 flat jaccard")
#> [1] "t-test P1 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6495905
temp$conf.int
#> [1] 0.6367873 0.6623936
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P1 weighted jaccard")
#> [1] "t-test P1 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#>  0.807743
temp$conf.int
#> [1] 0.7983065 0.8171795
#> attr(,"conf.level")
#> [1] 0.95
if(TRUE){


  ### Matrix built from Z score first then add columns of change in other scores
  ## NOT going to be general to arbitrary sets of matrix measures -> use only Z score and then concatenate other measures
  spec_mats <- list()

  sim_mat <- similarity_func(exp_mat)
  sim_tree <- cluster_func(sim_mat)
  weights <- get_weights(sim_tree, colnames(exp_mat))
  flat <- weights; flat[] <- 1
  for(measure in specificity_measures$func_names){
    specificity_func <- specificity_measures$funcs[[measure]]
    if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
    if(specificity_measures$out_type[measure]=="matrix"){
      spec_mat_flat <- specificity_func(exp_mat, flat)
      spec_mat_weighted <- specificity_func(exp_mat, weights)
    } else if (specificity_measures$out_type[measure]=="vector"){
      spec_mat_flat <- as.matrix(specificity_func(exp_mat, flat))
      rownames(spec_mat_flat) <- rownames(exp_mat)
      spec_mat_weighted <- as.matrix(specificity_func(exp_mat, weights))
      rownames(spec_mat_weighted) <- rownames(exp_mat)
    } else { stop("WARNING: no out type associated with measure") }

    spec_mats[[paste("delta", measure,sep = "_")]] <- spec_mat_weighted - spec_mat_flat
    spec_mats[[paste("weighted", measure,sep = "_")]] <- spec_mat_weighted
    spec_mats[[paste("flat", measure,sep = "_")]] <- spec_mat_flat
  }

  ## look at top n and bottom n genes with greatest change in specificity score between weighted and flat
  gene_list <- list(up=list(),down=list())
  for(spec_mat_name in names(spec_mats)[grep("delta", names(spec_mats))] ){
    spec_mat <- spec_mats[[spec_mat_name]]
    measure <- str_split_fixed(spec_mat_name, "_", 2)[,2]
    ## for each tissue for matrix measures
    if(specificity_measures$out_type[measure]=="matrix"){
      n=5 ## top n for each tissue (column) of matrix
      temp_name <- measure
      gene_list$up[[temp_name]] <- data.frame("gene"=numeric())
      gene_list$down[[temp_name]] <- data.frame("gene"=numeric())
      ## go in same order as similarity tree
      for(i in get_leaves_attr(sim_tree, "label")){
        gene_list$up[[temp_name]] <- rbind(gene_list$up[[temp_name]],
                                           data.frame("gene"=rownames(spec_mat_flat)[order((spec_mat[,i]), decreasing = TRUE)[1:n]]))
      }
      for(i in get_leaves_attr(sim_tree, "label")){
        gene_list$down[[temp_name]] <- rbind(gene_list$down[[temp_name]],
                                           data.frame("gene" = rownames(spec_mat_flat)[order((spec_mat[,i]*(-1)), decreasing = TRUE)[1:n]]))
        }
    } else if (specificity_measures$out_type[measure]=="vector"){
    } else { stop("WARNING: no out type associated with measure") }

  }

  ### gene lists generated now to organize into heatmaps
    ## set up color functions for heatmaps
  col_funs <- list()
  for(type in c("delta","flat","weighted")){
    col_funs[[paste( type, "Zscore" ,sep="_")]] <-colorRamp2(c(min(c(-2, min(spec_mats[[paste( type, "Zscore" ,sep="_")]], na.rm = T)) ),
                                        -1.99, -1, 0, 1, 1.99,
                                        max(c(2, max( spec_mats$delta_Zscore, na.rm = T) ) )),
                                      c("darkblue", "blue", "#B8B8FF" ,"white", "#FFB8B8",  "red","red4"))
  }

  ## get genes in top and bottom (note: genes may be repeated since want to see top n for each group )
  gene_vec <- data.frame(gene = character())
  for(ud in names(gene_list)){
    for(measure in names(gene_list[[ud]])){
      gene_vec <- rbind(gene_vec, gene_list[[ud]][[measure]])
    }
  }

  heatmaps <- list()
  for(type in c("delta", "flat", "weighted")){
    heatmaps[[type]] <- HeatmapList()
    for(measure in names(gene_list[["up"]])){
      spec_mat_name <- names(spec_mats)[intersect(grep(measure, names(spec_mats)), grep(type, names(spec_mats)))]
      spec_mat <-  spec_mats[[spec_mat_name ]]
      temp_name <- paste(spec_mat_name, sep="_")
      if(measure=="Zscore"){
        h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
                      row_order = gene_vec$gene, show_row_names = FALSE,
                      cluster_columns = sim_tree,
                      column_order = colnames(spec_mat),
                      col = col_funs[[paste(type,measure, sep="_")]])
      } else {
        h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
                      row_order = gene_vec$gene, show_row_names = FALSE,
                      col = col_funs[[paste(type,measure, sep="_")]])
      }
      if(length(heatmaps[[type]])==0){
        heatmaps[[type]] <- h1
      } else {
        heatmaps[[type]] <- heatmaps[[type]] + h1
      }
    }
  }

heatmaps$delta
heatmaps$flat + heatmaps$weighted
}


head(gtex_rowinfo)
#>                Name Description
#> 1 ENSG00000223972.5     DDX11L1
#> 2 ENSG00000227232.5      WASH7P
#> 3 ENSG00000278267.1   MIR6859-1
#> 4 ENSG00000243485.5 MIR1302-2HG
#> 5 ENSG00000237613.2     FAM138A
#> 6 ENSG00000268020.3      OR4G4P

specific_genes <- c( "OLIG1","OLIG2",
                     "PRSS1", "PTF1A",
                     "MYH6", "MYL4")
spec_genes_w_ids <- gtex_rowinfo[which( is.element(gtex_rowinfo$Description, specific_genes)),]
spec_genes_w_ids <- spec_genes_w_ids[which(is.element(spec_genes_w_ids$Name, rownames(spec_mats$delta_Zscore))),]
spec_genes_w_ids <- spec_genes_w_ids[match(specific_genes, spec_genes_w_ids$Description),]
spec_genes_w_ids <- merge( data.frame(f_or_w = c("flat","weighted")),spec_genes_w_ids)
spec_genes_w_ids <- spec_genes_w_ids[complete.cases(spec_genes_w_ids),]


col_funs <- list()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
  if(measure=="Zscore"){
    col_funs[[measure]] <-colorRamp2( c( min(c(min(spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE ),
                                             min(spec_mats$flat_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),-4)),
                                    #   -2, -1.99, 0, 1.99, 2, 3.99 ,
                                      -3, -2, 0, 2, 3, 3.99,
                                       max(c(max( spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE ),
                                             max( spec_mats$flat_Zscore[spec_genes_w_ids$Name,]), na.rm = TRUE),4)),
                                     c("darkblue", "blue", "#aaaaff" ,"white", "#ffaaaa",  "red","red4", "grey20"))
                                    #c("darkblue", "blue", "#aaaaff" ,"white", "#ffaaaa",  "red","red4"))
  } else {
    col_funs[[measure]] <-colorRamp2(c( 0, 0.4, 0.6, 0.8, 1),
                                     c("white", "lightgoldenrod1","yellow" ,"red","red4"))
    #col_funs[[measure]] <-colorRamp2(c( 0, 0.5, 1),
    #                                 c("white","red","red4"))
  }
}


heatmaps <- HeatmapList()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
  temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
  colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
  rownames(temp_mat) <- spec_genes_w_ids$Name
  for(i in 1:nrow(spec_genes_w_ids)){
    temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
  }
  if(measure=="Zscore"){
    split <- rep(1:(nrow(spec_genes_w_ids)/2), each=2)
    ra <- rowAnnotation(foo = anno_block(labels =  paste(unique(spec_genes_w_ids$Description), "\nW     F")))
    h1 <- Heatmap(temp_mat, name=measure,
                  cluster_rows = FALSE, show_row_names = FALSE,
                  cluster_columns = sim_tree,
                  column_order = colnames(spec_mats$delta_Zscore),
                  row_split = split,
                  left_annotation = ra,
                  col = col_funs[[measure]]
                  )
    } else {
      # h1 <- Heatmap(temp_mat, name=measure,
      #               cluster_rows = FALSE, show_row_names = TRUE,
      #               col = col_funs[[measure]])
      }
  if(length(heatmaps)==0){
      heatmaps <- h1
      # } else {
      # heatmaps <- heatmaps + h1
    }
}
draw(heatmaps)




## get particular values reported in paper
measure="Zscore"
temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
rownames(temp_mat) <- spec_genes_w_ids$Name
for(i in 1:nrow(spec_genes_w_ids)){
  temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
}
rownames(temp_mat) <- paste(c("flat","weighted"), spec_genes_w_ids$Description)
## gene ontology summaries
library(DOSE)
library(pathview)
library(clusterProfiler)
library(org.Hs.eg.db)

library(tidyverse)
library(topGO)
library(enrichplot)

spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[1:length(flat)] <- 1
spec_mat_flat  <- calc_weighted_zscore_matrix(exp_mat, flat)

cutoff <- 2
which <- which((spec_mat_weighted >= cutoff & spec_mat_flat < cutoff))

allOE_genes<-str_split_fixed(rownames(spec_mat_flat),"[.]",2)[,1]

sigOE_genes <- character()
for(j in 1:ncol(spec_mat_weighted)){
 which<-which((spec_mat_flat[,j]<=2.0 & spec_mat_weighted[,j]>=2.0))
which<-as.matrix(which)
  which<- rownames(which)
  which<-str_split_fixed(which, "[.]", 2)
  which<- which[,1]
   sigOE_genes <- c(sigOE_genes, which)
}
sigOE_genes<-unique(sigOE_genes)


for(onto in c("BP")){ #},"MF","CC")){
  print(onto)
  ego <- enrichGO(gene = sigOE_genes,
                  universe = allOE_genes,
                  keyType = "ENSEMBL",
                  OrgDb = org.Hs.eg.db,
                  ont = onto,
                  pAdjustMethod = "BH",
                  qvalueCutoff = 0.05,
                  readable = TRUE,
                  pool=TRUE)

  n=10
  print(dotplot(ego, showCategory = n))

  ego <- enrichplot::pairwise_termsim(ego)
  print(emapplot(ego, showCategory = n))

}
#> [1] "BP"
#> wrong orderBy parameter; set to default `orderBy = "x"`

cluster_summary <- data.frame(ego)
cluster_summary$frac_terms <- as.numeric(str_split_fixed(cluster_summary$GeneRatio,"/",2)[,1] ) / as.numeric( str_split_fixed(cluster_summary$BgRatio,"/",2)[,1] )


## only 1 similarity function tested for now, can make as list later
similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}

## only 1 clustering fucntion tested for now, can make as a list later
cluster_func <- function(sim_mat, method="single"){
  add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = method) ) ))
}

sim_mat <- similarity_func(exp_mat)
cluster_methods <- c("single","complete","average")
for(cm in cluster_methods){
  par(mar = c(16,2,2,2))
  sim_tree <- cluster_func(sim_mat = sim_mat, method = cm)
  sim_tree %>% plot(main=cm)
}

### to do: justify method for measuring sample similarity - may require comparison or refs
calc_dot_product_similarity_matrix <- function(dat) {
  dot_product_similarity_matrix <- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(dot_product_similarity_matrix) <- colnames(dat)
  rownames(dot_product_similarity_matrix) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)){
      which_i <- which(!is.na(dat[,i])) ## ignore NAs
      which_j <- which(!is.na(dat[,j])) ## ignore NAs
      dot_product_similarity_matrix[i,j] <- sum(dat[which_i,i] * dat[which_j,j]) / (norm(dat[which_i,i],"2")*norm(dat[which_j,j],"2"))
    }
  }
  return(dot_product_similarity_matrix)
}
sim_mat<- (similarity_func(exp_mat))
sim_mat <- (sim_mat-min(sim_mat,na.rm=T)); sim_mat <- sim_mat/max(sim_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(sim_mat)
heatmap(sim_mat, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))



####ADDED
calc_eucl_matrix<- function(dat) {
  eucl<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(eucl) <- colnames(dat)
  rownames(eucl) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
       eucl[i,j]<-sqrt(sum((dat[,i]-dat[,j])^2,na.rm = T ) )
    }
  }
  return(eucl)
}
eucl_matrix<- 1-(calc_eucl_matrix(calc_zscore_matrix(exp_mat)))
eucl_matrix <- (eucl_matrix-min(eucl_matrix,na.rm=T)); eucl_matrix <- eucl_matrix/max(eucl_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(eucl_matrix)
heatmap(eucl_matrix, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))



####ADDED
calc_manhattan_matrix<- function(dat) {
  manhatt<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(manhatt) <- colnames(dat)
  rownames(manhatt) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
       manhatt[i,j]<-sum(abs(dat[,i]-dat[,j]),na.rm = T)
    }
  }
  for(i in 1:nrow(manhatt)){
    manhatt[,i] <- manhatt[,i] / max(manhatt,na.rm=T )
  }
  return(manhatt)
}
manhat_matrix<- 1-(calc_manhattan_matrix(calc_zscore_matrix(exp_mat)))
manhat_matrix <- (manhat_matrix-min(manhat_matrix,na.rm=T)); manhat_matrix <- manhat_matrix/max(manhat_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(manhat_matrix)
heatmap(manhat_matrix, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))



calc_canberra_matrix<- function(dat) {
  canberra<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(canberra) <- colnames(dat)
  rownames(canberra) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
      canberra[i,j]<- sum(abs(dat[,i] - dat[,j])/ (abs(dat[,i])+abs(dat[,j])) , na.rm = T)
    }
  }
  for(i in 1:nrow(canberra)){
    canberra[,i] <- canberra[,i] / max(canberra,na.rm=T )
  }
  return(canberra)
}
canberra_mat<- 1-(calc_canberra_matrix( calc_zscore_matrix(exp_mat)))
canberra_mat <- (canberra_mat-min(canberra_mat,na.rm=T)); canberra_mat <- canberra_mat/max(canberra_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(canberra_mat)
heatmap(canberra_mat, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))

======= specificity_paper_main

load gtex here

## import gtex medians data
temp <- tempfile()
download.file("https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",temp)
gtex <- read.table( temp, skip=2, header = TRUE, sep = "\t")
gtex_rowinfo <- data.frame(Name=gtex$Name, Description=gtex$Description)
rownames(gtex) <- gtex$Name
gtex <- as.matrix(gtex[,3:ncol(gtex)])
unlink(temp); rm(temp)


## import ensembl gene data
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
genes <- getBM(attributes=c('chromosome_name','start_position','end_position','hgnc_symbol', 'ensembl_gene_id','gene_biotype'),
                 filters = list('biotype'='protein_coding'),
                 mart = ensembl, useCache = F) 
genes <- genes[which(is.element(genes$chromosome_name, c(1:22, "X", "Y", "MT")) & genes$hgnc_symbol != "" ) ,]

clean gtex dataset here

## show mitochondrial genes drive a large part of sample similarity
## Note sum of medians in gtex not quite 1M - likely artifact of taking medians

temp <- data.frame(names=colnames(gtex), total_transcripts=colSums(gtex))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


## match genes between gtex and ensembl
gtex_names <- str_split_fixed(gtex_rowinfo$Name, "[.]", 2)[,1]
which <- which(genes$chromosome_name != "MT" )
gtex_cleaned <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[which])),]
which <- which(genes$chromosome_name == "MT" )
gtex_cleanedMT <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[which])),]

##non-mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


##mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleanedMT), total_transcripts=colSums(gtex_cleanedMT))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


rm(gtex_cleanedMT)
##renormalize TPM without mitochondrial genes
for(i in 1:ncol(gtex_cleaned)){
  gtex_cleaned[,i] <- (gtex_cleaned[,i]*1e6 / sum(gtex_cleaned[,i]))
}

temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
  geom_col()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


gtex <- gtex_cleaned; rm(gtex_cleaned)
exp_mat <- gtex
## log10(TPM+1) transform of data
exp_mat <- log10(exp_mat+1)

## remove mitochondrial contribution
grid.arrange(grobs=grobl_supp, nrow=4, heights=c(5,3,3,3))

## get gene ids (rows from df_1b) that correspond to given GO id/ GO label
## generate small set of brain related GO id/ GO labels
## plot on one v all plot

genes <- getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id','go_id','name_1006'),
               mart = ensembl, useCache = F)
#genes <- load("genes_w_go.RData")

allOE_genes<-str_split_fixed(rownames(exp_mat),"[.]",2)[,1]
ggl <- list()
for(measure in specificity_measures$func_names){
  temp_df <- df_1b[which( df_1b$measure == measure),]
  temp_df_sub <- slice_max(temp_df, order_by=delta, n=nrow(temp_df)/100 )
  sigOE_genes <- str_split_fixed(temp_df_sub$Var1,"[.]",2)[,1]
  ego <- enrichGO(gene = sigOE_genes, 
                  universe = allOE_genes,
                  keyType = "ENSEMBL",
                  OrgDb = org.Hs.eg.db, 
                  ont = "BP", 
                  pAdjustMethod = "BH",
                  qvalueCutoff = 0.05,
                  readable = TRUE,
                  pool=TRUE)
  
  ggl[[measure]] <- ggplot(temp_df, aes(x=all, y=one))+
    geom_point(size=0.1)+
    geom_point(data=temp_df_sub, aes( x=all, y=one), color="red")+
    ggtitle(measure)+
    geom_abline(slope=1,intercept=0, color="red", size=0.3)+
    theme_bw()+
      theme(panel.grid.minor.x = element_blank())
  
  n=10
  filename <- paste(figures_dir,"/",measure,"_top1percent_deltas_go.png",sep = "")
  ego <- enrichplot::pairwise_termsim(ego)
  p1 <- ggl[[measure]]
  p2 <- enrichplot::dotplot(ego, showCategory = n, font.size=8, label_format=15)+
          theme(legend.position = "none")
    
  p3 <- emapplot(ego, showCategory = n, cex_label_category=0.5, cex_category=0.5, cex_line=0.5 )+
    scale_y_discrete(labels=str_wrap(ego@result$Description[1:n], width=15 ))


  grid.arrange(p1, p2,p3, ncol=3) #, widths = c(3,3,4))
  #dev.off()

}
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 25284 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`

#> Warning: Removed 602 rows containing missing values (geom_point).

## dot similarity from initial Z scores
dot_sim <- similarity_func(exp_mat)
sim_tree <- cluster_func(dot_sim)
heatmap(dot_sim, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13), symm=T)

sim_tree <- cluster_func(dot_sim)
## plot similarity tree with weights
par(mar=c(2,2,2,15))
sim_tree %>%
  dendextend::set("nodes_pch",19) %>% 
  dendextend::set("nodes_cex",2* sqrt(get_nodes_attr(sim_tree,"weight"))) %>%
  plot(horiz=T)

## take a look at the weights
temp <- get_weights(sim_tree, get_leaves_attr(sim_tree, "label"))
temp <- data.frame(weights=temp, names=factor(names(temp), levels=rev(names(temp)) ) )
g <- ggplot( temp ,aes(y=fct_rev(as.factor(names)), x=weights))+
  geom_col() #+
  #theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))
g

weights <- get_weights(sim_tree, colnames(exp_mat))
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[1:length(flat)] <- 1
spec_mat_flat  <- calc_weighted_zscore_matrix(exp_mat, flat)
## plot to look at global dif between flat and weighted
tau_flat <- calc_weighted_tau(exp_mat, flat)
tau_weighted <- calc_weighted_tau(exp_mat, weights)
tsi_flat <- calc_weighted_tsi(exp_mat,flat)
tsi_weighted <- calc_weighted_tsi(exp_mat,weights)
gini_flat <- calc_weighted_gini(exp_mat, flat)
gini_weighted <- calc_weighted_gini(exp_mat, weights)

hist(spec_mat_flat ,breaks=100)
hist(spec_mat_weighted ,breaks=100)
hist(tau_flat ,breaks=100)
hist(tau_weighted  ,breaks=100)
hist(tsi_flat ,breaks=100)
hist(tsi_weighted ,breaks=100)
hist(gini_flat ,breaks=100)
hist(gini_weighted ,breaks=100)
hist(spec_mat_weighted - spec_mat_flat ,breaks=100)
hist(tau_weighted - tau_flat ,breaks=100)
hist(tsi_weighted - tsi_flat ,breaks=100)
hist(gini_weighted - gini_flat ,breaks=100)

## load data instead
if(TRUE){  ## NOTE! This chunk can require a lot of memory and time depending if running sequential. If you want to look at robustness test results it is recommended you have enabled parallel computing (e.g. check getDoParWorkers > 3 or 4 (preferably more))
  
  ## Use this variable to control whether performing random or true brain-non-brain partition
  random_partition = FALSE
  
  
  robustness_test_results <- list( )
  for(i in 1:length(specificity_measures$func_names)){
    el_names <- paste( c( "P1_", "P2_")
                       , specificity_measures$func_names[i], sep="")
    temp_list <- setNames(list(list(), list()), nm=el_names  )
    robustness_test_results <- c(robustness_test_results, temp_list )
  }
  
  
  num_brain_samples <- length(colnames(exp_mat)[grep("[Bb]rain",colnames(exp_mat))])
  num_reps <- 3
  ## use general similarity_func and cluster_func in case we want to test more later
  similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}
  cluster_func <- function(sim_mat){add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = "single") ) ))}  
  
  
  ## to switch analysis between True Brain Partition and equalivalent sized Random Partition set random_partition ##
  if(random_partition){
    which <- sample(1:ncol(exp_mat), length(grep("[Bb]rain",colnames(exp_mat),invert = TRUE)))
    P1 <- as.matrix(exp_mat[, which])
    notP1 <- as.matrix(exp_mat[, which(!is.element(1:ncol(exp_mat),which ))])
  }else{
    ## P1 and notP1 are contant throughout, so set these outside loops
    P1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = TRUE)])
    notP1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = FALSE)])
  }
  ### define 1-n trajectories ahead of time since i+1 should be referenced against i.
  ## do foreach over each element of permuation matrix ## use arrayInd to get row/col given perm number
  permutation_matrix <- matrix(nrow = num_brain_samples, ncol = num_reps )
  for(rep in 1:num_reps){
    permutation_matrix[,rep] <- sample(colnames(notP1),num_brain_samples, replace = FALSE)
  }
  if(!random_partition){
    load("final_permutation_matrix") ## reproduce same used in paper figures
  }
  if(random_partition){
    load("final_random_permutation_matrix") ## reproduce same used in paper figures
  }

  
  for(measure in specificity_measures$func_names){
    specificity_func <- specificity_measures$funcs[[measure]]
    if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
    
      results <- foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
      library(Hmisc)
      library(dendextend)
      library(reldist)
      sample_indices <- 1:(arrayInd(perm, dim(permutation_matrix))[,1]) ## row is sample name, all names in a row up to top are choices for n=1,2..i
      rep_index <- arrayInd(perm, dim(permutation_matrix))[,2] ## column is replicate number
      sample_set <- permutation_matrix[sample_indices, rep_index]
      
      # P1_baseline is just P1 at n=1
      P2 <- notP1[,sample_set, drop=F]; colnames(P2) <- make.unique(colnames(P2)) ## make.unique will facilitate sampling with replacement
      P1uP2 <- cbind(P1,P2)
      
      ## P1uP2 ## exp_mat > get sim mat > get sim tree > get samp weights > get spec mat
      sim_mat <- similarity_func(P1uP2)
      sim_tree <- cluster_func(sim_mat)
      weights <- get_weights(sim_tree, colnames(P1uP2))
      flat <- weights; flat[] <- 1
      
      
      specificity_flat <- specificity_func(P1uP2, flat)
      specificity_weighted <- specificity_func(P1uP2, weights)
      
      # need to fill in these with spec_score matrices for each n for each method
      # names should match 
      ### restore once error fixed
      if(specificity_measures$out_type[[measure]]=="matrix"){
        result <- list(
          P1_weighted_spec_score = specificity_weighted[,colnames(P1), drop=F],
          P1_flat_spec_score = specificity_flat[,colnames(P1), drop=F],
          P2_weighted_spec_score = specificity_weighted[,colnames(P2), drop=F],
          P2_flat_spec_score = specificity_flat[,colnames(P2), drop=F]
        )
      }else if(specificity_measures$out_type[[measure]]=="vector"){
        result <- list(P1_weighted_spec_score = specificity_weighted,
                        P1_flat_spec_score = specificity_flat,
                        P2_weighted_spec_score = specificity_weighted,
                        P2_flat_spec_score = specificity_flat
                        )
      }
      
      }
      
      results_P1 <- list()
      results_P2 <- list()
      for(i in 1:length(results)){
        name <- names(results)[i]
        rep <- str_split_fixed(names(results)[1], "[=,]", n=4)[,2]
        n <- str_split_fixed(names(results)[1], "[=,]", n=4)[,4]
        results_P1[[name]] <- list(P1_weighted_spec_score = results[[name]]$P1_weighted_spec_score, P1_flat_spec_score = results[[name]]$P1_flat_spec_score)
        results[[name]]$P1_weighted_spec_score <- NULL;  results[[name]]$P1_flat_spec_score <- NULL;  ## large object so delete as we go to avoid copy
        
        results_P2[[name]] <- list(P2_weighted_spec_score = results[[name]]$P2_weighted_spec_score, P2_flat_spec_score = results[[name]]$P2_flat_spec_score)
        results[[name]]$P2_weighted_spec_score <- NULL;  results[[name]]$P2_flat_spec_score<- NULL;  ## large object so delete as we go to avoid copy
      }
      
      robustness_test_results[[paste("P1_", measure,sep = "")]] <- results_P1; rm(results_P1)
      robustness_test_results[[paste("P2_", measure,sep = "")]] <- results_P2; rm(results_P2)
  }

  
} ## end of chunk control
## output to look at is robustness_test_results
#save.image()

df_robustness_test_results <- data.frame(measure=character(),
                                         P1orP2=character(),
                                         weight_or_flat=character(),
                                         n=numeric(),
                                         rep=numeric(),
                                         variance=numeric()
                                         )

for(el in 1:length(robustness_test_results)){
  if(length(robustness_test_results[[el]])==0){next;}
  temp_name <- str_split_fixed(names(robustness_test_results[el]),"_",2)
  
  temp_len <- length(robustness_test_results[[el]]) ## each has weighted and flat
  temp_df <- data.frame(measure=rep(temp_name[2],temp_len),
                        P1orP2=rep(temp_name[1],temp_len),
                        weight_or_flat=character(temp_len),
                        n=numeric(temp_len),
                        rep=numeric(temp_len),
                        variance=numeric(temp_len))
  temp_df_list <- list("weighted"=temp_df, "flat"=temp_df)
  temp_df_list[["weighted"]]$weight_or_flat <- "weighted"
  temp_df_list[["flat"]]$weight_or_flat <- "flat"
  
  ## build the temp_dfs in these loops
  for(i in 1:length(robustness_test_results[[el]])){
    for(f_or_w in c("flat","weighted")){
      temp_name <- str_split_fixed(names(robustness_test_results[[el]][i]),"[,=]",4)
      temp_df_list[[f_or_w]]$rep[i] <- as.numeric(temp_name[,grep("rep", temp_name)+1])
      temp_df_list[[f_or_w]]$n[i] <- as.numeric(temp_name[,grep("n", temp_name)+1])
      temp <- robustness_test_results[[el]][[i]]
      temp <- temp[[grep(f_or_w, names(temp))]]
      
      ## baseline matched on replicate, and is set at n=1
      temp_baseline <- robustness_test_results[[el]][[paste("rep=",temp_name[,grep("rep", temp_name)+1],",n=1",sep="")]]
      temp_baseline <- temp_baseline[[grep(f_or_w, names(temp_baseline))]]
      
      temp_df_list[[f_or_w]]$variance[i] <- var(as.vector(as.matrix(temp)) - as.vector(as.matrix(temp_baseline)), na.rm=T)
    }
  }
  ## compile temp_dfs into final result
  df_robustness_test_results <- rbind(df_robustness_test_results, temp_df_list[["weighted"]], temp_df_list[["flat"]])
  
}

df_2c <- df_robustness_test_results
df_2c_summary_by_partition <- aggregate(df_2c$variance,
                                     by=list(measure=df_2c$measure,
                                             P1orP2=df_2c$P1orP2,
                                             weight_or_flat= df_2c$weight_or_flat,
                                             n=df_2c$n),
                                     FUN=function(x) mean(x,na.rm=TRUE))
names(df_2c_summary_by_partition)[ncol(df_2c_summary_by_partition)] <- "mean_var"
df_2c_summary_by_partition$sd_of_var <- aggregate(df_2c$variance,
                                     by=list(measure=df_2c$measure,
                                             P1orP2=df_2c$P1orP2,
                                             weight_or_flat= df_2c$weight_or_flat,
                                             n=df_2c$n),
                                     FUN=function(x) sd(x,na.rm=TRUE))$x


temp <- df_2c_summary_by_partition
df_2c_summary_by_partition$y_min <-  temp$mean_var-temp$sd_of_var/sqrt(num_reps)
df_2c_summary_by_partition$y_max <-  temp$mean_var+temp$sd_of_var/sqrt(num_reps)


ggl <- list()
## Zscore first 
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2c_summary_by_partition$measure == measure)
    which2 <- which(df_robustness_test_results$measure == measure)  
  } else {
    which <- which(df_2c_summary_by_partition$measure != "Zscore")
    which2 <- which(df_robustness_test_results$measure != "Zscore")
  }
  
  gg <- ggplot(df_2c_summary_by_partition[which,], aes(x=n, y=mean_var)) +
    scale_color_manual(values=c("grey20","steelblue3"))+
    scale_fill_manual(values=c("grey20","steelblue3"))+
    geom_ribbon(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, fill=weight_or_flat),alpha=0.5)+
    geom_line(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, color=weight_or_flat, fill=weight_or_flat))+
    geom_point(data = df_robustness_test_results[which2,], size=0.2,
               aes(x=n, y=variance,group=weight_or_flat,color=weight_or_flat, fill=weight_or_flat))+
    theme_bw()+
    theme(panel.grid.minor.x = element_blank())+
    scale_x_continuous(limits = c(0,13), breaks=seq(0,12,by=1), expand = c(0,0))
  
  if(measure == "Zscore"){
     gg <- gg+facet_wrap(measure ~ P1orP2 , scales = "free",
                         labeller=labeller(P1orP2=c("P1"="P1 (non-brain)", "P2"="P2 (brain)")))
     gg <- gg+theme(legend.position = "none")
  } else {
     gg <- gg+facet_wrap( ~ measure, ncol = 3)
     gg <- gg+theme(legend.position="bottom")
  }
  ggl[[measure]] <- gg
}
#> Warning: Ignoring unknown aesthetics: ymin, ymax, fill

#> Warning: Ignoring unknown aesthetics: ymin, ymax, fill

grobl <- list()
for(n in names(ggl)){
  grobl[[n]] <- ggplotGrob(ggl[[n]])
}
grid.arrange(grobs=grobl, nrow=2)

### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
which_list <- list()
which_list[["Z_P1_flat"]] <- which(df_2c$measure=="Zscore" 
                    & df_2c$P1orP2=="P1"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="flat")
which_list[["Z_P1_weighted"]] <- which(df_2c$measure=="Zscore" 
                    & df_2c$P1orP2=="P1"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="weighted")
which_list[["Z_P2_flat"]] <- which(df_2c$measure=="Zscore" 
                    & df_2c$P1orP2=="P2"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="flat")
which_list[["Z_P2_weighted"]] <- which(df_2c$measure=="Zscore" 
                    & df_2c$P1orP2=="P2"
                    & df_2c$n == max(df_2c$n)
                    & df_2c$weight_or_flat=="weighted")


print("t-test P1 weighted / P1 flat : %diff between weighted and flat")
#> [1] "t-test P1 weighted / P1 flat : %diff between weighted and flat"
## t.test on logs gives ratio estimates
temp <- t.test(x=log10(df_2c[which_list$Z_P1_weighted,]$variance),y=log10(df_2c[which_list$Z_P1_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
##  antilog to recover ratio of weighted / flat
1-10^temp$estimate
#> mean of the differences 
#>               0.7932209
1-10^temp$conf.int
#> [1] 0.8021299 0.7839107
#> attr(,"conf.level")
#> [1] 0.95
### 79.3% lower variance in weighted than flat (CI)

print("t-test P2 weighted / P2 flat : %diff between weighted and flat")
#> [1] "t-test P2 weighted / P2 flat : %diff between weighted and flat"
temp <- t.test(x=log10(df_2c[which_list$Z_P2_weighted,]$variance),y=log10(df_2c[which_list$Z_P2_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
1-10^temp$estimate
#> mean of the differences 
#>               0.3578047
1-10^temp$conf.int
#> [1] 0.4196920 0.2893173
#> attr(,"conf.level")
#> [1] 0.95
### 

## jaccard between n = 1 and n = num_brain_samples for brain v non-brain OR P1 v P2 

df_2d <- data.frame(measure=character(), n=numeric(),
                 rep=numeric(), cut_point=numeric(),
                 jaccard=numeric(), P1orP2=character(),
                 weighted=logical(), samp=character(),
                 gene_count=numeric(), gene_count_baseline=numeric(),
                 gene_count_intersect=numeric())
## nested for loop builds df of jaccard results for plotting
Sys.time()
#> [1] "2022-01-02 13:47:37 PST"
for(el in names(robustness_test_results)){
  if(length(robustness_test_results[[el]])==0 ){next;}
  P1orP2 <- str_split_fixed(el,"_",n=2)[1]
  measure <- str_split_fixed(el,"_",n=2)[2]
  if(measure == "Zscore"){cuts <- seq(0,4,0.5)   ## set cutoffs
  }else{ cuts <- seq(0,1,0.05)}
  
 # for(i in 1:length(robustness_test_results[[el]]) ){   ### foreach at this level if implementing foreach
  df_temp <- foreach(i=1:length(robustness_test_results[[el]]), .errorhandling = "pass", .combine = rbind) %dopar% {
    #foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
    library(stringr)
    df_internal <- data.frame(measure=character(), n=numeric(),
                              rep=numeric(), cut_point=numeric(), 
                              jaccard=numeric(), P1orP2=character(),
                              weighted=logical(), samp=character(),
                              gene_count=numeric(), gene_count_baseline=numeric(),
                              gene_count_intersect=numeric())
    
    n = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,4])
    rep = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,2])
    for(weighted_or_flat in c("weighted","flat")){
      baseline_name <- names(robustness_test_results[[el]])[grep(paste("rep=",rep,",n=",1,"$", sep = ""),names(robustness_test_results[[el]]) )]
      
      if(specificity_measures$out_type[measure] == "matrix"){
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
        baseline_mat <-  robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]]
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
        temp_mat <- robustness_test_results[[el]][[i]][[weighted_flat_index]][,colnames(baseline_mat),drop=FALSE]  
      }else if(specificity_measures$out_type[measure] == "vector"){
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
        baseline_mat <-  as.matrix(robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]])
        weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
        temp_mat <- as.matrix(robustness_test_results[[el]][[i]][[weighted_flat_index]])         
      }else{ stop("WARNING: no out type associated with measure") }
      
      ncols <- ncol(temp_mat); ncuts <- length(cuts)
      temp_df <- data.frame(measure=rep(measure, ncols*ncuts), n=rep(n, ncols*ncuts),
                            rep=rep(rep, ncols*ncuts), cut_point=numeric(length=ncols*ncuts),
                            jaccard=numeric(length=ncols*ncuts), P1orP2=rep(P1orP2, ncols*ncuts),
                            weighted=rep( grepl("weighted", weighted_or_flat), ncols*ncuts),
                            samp=character(length=ncols*ncuts),
                            gene_count=numeric(length=ncols*ncuts), gene_count_baseline=numeric(length=ncols*ncuts),
                            gene_count_intersect=numeric(length=ncols*ncuts)  )
      for(s in 1:ncol(temp_mat) ){  ## add a row to df for each sample   
        for(cut_index in 1:length(cuts)){
          ind <- (s-1)*length(cuts)+cut_index  ### use arr_ind here instead
          temp_df[ind,]$cut_point <- cuts[cut_index]
          if(cuts[cut_index]<0){
            temp_df[ind,]$jaccard <- (  length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index]  )) /
                                        length( which( baseline_mat[,s] <= cuts[cut_index] | temp_mat[,s] <= cuts[cut_index]  )) )
            temp_df[ind,]$gene_count <-  length( which(temp_mat[,s] <= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] <= cuts[cut_index]  ) ) 
            temp_df[ind,]$gene_count_intersect <-  length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index]  ))
          } else {
            temp_df[ind,]$jaccard <- (  length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index]  )) /
                                        length( which( baseline_mat[,s] >= cuts[cut_index] | temp_mat[,s] >= cuts[cut_index]  )) )
            temp_df[ind,]$gene_count <-  length( which(temp_mat[,s] >= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_baseline <-  length( which(baseline_mat[,s] >= cuts[cut_index]  ) )
            temp_df[ind,]$gene_count_intersect <-  length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index]  ))
          }
          temp_df[ind,]$samp <- ifelse(!is.null((colnames(baseline_mat[,s,drop=FALSE] ) )),
                                       unique(colnames(baseline_mat[,s,drop=FALSE] ),
                                              colnames(temp_mat[,s,drop=FALSE] ) ), NA )
        }
      }
      df_internal <- rbind(df_internal, temp_df)
    }
    df_internal
  }
  df_2d <- rbind(df_2d, df_temp)
}
Sys.time()
#> [1] "2022-01-02 14:24:56 PST"


df_2d_summary_by_partition <-aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                    FUN=function(x) mean(x,na.rm=TRUE) )
names(df_2d_summary_by_partition)[length(names(df_2d_summary_by_partition))] <- "jaccard_mean"
df_2d_summary_by_partition$jaccard_sd <- aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted) ,
                                                FUN=function(x) sd(x,na.rm = TRUE) )$x 
df_2d_summary_by_partition$gene_count <- aggregate(df_2d$gene_count,  by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted)
                                                , FUN=function(x) mean(x,na.rm = TRUE) )$x 
df_2d_summary_by_partition$gene_count_baseline <- aggregate(df_2d$gene_count_baseline, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                                        FUN=function(x) mean(x,na.rm = TRUE) )$x 
df_2d_summary_by_partition$gene_count_intersect <- aggregate(df_2d$gene_count_intersect, by=list(measure=df_2d$measure, n=df_2d$n,
                                                        cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
                                                        weighted=df_2d$weighted),
                                                        FUN=function(x) mean(x,na.rm = TRUE) )$x 


  
ggl_jacc <- list()
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    which <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  gg <- ggplot(df_2d_summary_by_partition[which,],
               aes(x=cut_point, y=jaccard_mean,
                   ymin=jaccard_mean-jaccard_sd/num_reps, 
                   ymax=jaccard_mean+jaccard_sd/num_reps,
                   group=n, fill=n))+
    theme_bw()+
    theme(
          panel.grid.minor.x = element_blank())+
    #ggtitle(paste("jaccard index for ", measure, " P1 and  P2, weighted and flat measures",sep=""))+
    geom_line(aes(color=n))+
    scale_color_continuous(low="powderblue", high="steelblue4")+
    scale_y_continuous(limits = c(0,1), breaks=seq(0,1,by=0.2))+
    facet_grid(P1orP2 ~ weighted, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1),  breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }
  
  ggl_jacc[[measure]] <- gg
}

grobl <- list()
for(n in names(ggl_jacc)){
  grobl[[n]] <- ggplotGrob(ggl_jacc[[n]])
}
grid.arrange(grobs =list(grobl$Zscore,grobl$notZscore), nrow=2)


## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}



ggl_count <- list()
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    which <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    which <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  gg <- ggplot(df_2d_summary_by_partition[which,],
               aes(x=cut_point, y=gene_count / nrow(genes),
                   group=n, fill=n))+
    theme_bw()+
    theme(
          panel.grid.minor.x = element_blank())+
    geom_line(aes(color=n))+
    scale_color_continuous(low="steelblue2", high="steelblue4")+
    #scale_y_log10( #limits = c(1,3e4),
    scale_y_continuous( limits=c(1e-4,1) , trans = log10_trans(),
                       breaks = trans_breaks("log10", function(x) 10^x), # ) #,
                       labels = percent,  )
    
  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1),  breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }
  ggl_count[[measure]] <- gg
}
grobl <- list()
for(n in names(ggl_count)){
  grobl[[n]] <- ggplotGrob(ggl_count[[n]])
}
#> Warning: Transformation introduced infinite values in continuous y-axis
grid.arrange(grobs =  list(grobl$Zscore, grobl$notZscore), nrow=2)


## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
which_list <- list()
which_list[["Z_P1_flat"]] <- which(df_2d$measure=="Zscore" 
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P1"
                                   & df_2d$weighted==FALSE)
which_list[["Z_P1_weighted"]] <- which(df_2d$measure=="Zscore" 
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P1"
                                   & df_2d$weighted==TRUE)

which_list[["Z_P2_flat"]] <- which(df_2d$measure=="Zscore" 
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P2"
                                   & df_2d$weighted==FALSE)
which_list[["Z_P2_weighted"]] <- which(df_2d$measure=="Zscore" 
                                   & df_2d$n==max(df_2d$n)
                                   & df_2d$cut_point==2
                                   & df_2d$P1orP2=="P2"
                                   & df_2d$weighted==TRUE)



print("t-test P2 flat jaccard")
#> [1] "t-test P2 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x 
#> 0.2513928
temp$conf.int
#> [1] 0.1101765 0.3926090
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P2 weighted jaccard")
#> [1] "t-test P2 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x 
#> 0.6359852
temp$conf.int
#> [1] 0.5740172 0.6979532
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P1 flat jaccard")
#> [1] "t-test P1 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x 
#> 0.6495905
temp$conf.int
#> [1] 0.6367873 0.6623936
#> attr(,"conf.level")
#> [1] 0.95

print("t-test P1 weighted jaccard")
#> [1] "t-test P1 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x 
#>  0.807743
temp$conf.int
#> [1] 0.7983065 0.8171795
#> attr(,"conf.level")
#> [1] 0.95
if(TRUE){


  ### Matrix built from Z score first then add columns of change in other scores
  ## NOT going to be general to arbitrary sets of matrix measures -> use only Z score and then concatenate other measures
  spec_mats <- list()
  
  sim_mat <- similarity_func(exp_mat)
  sim_tree <- cluster_func(sim_mat)
  weights <- get_weights(sim_tree, colnames(exp_mat))
  flat <- weights; flat[] <- 1
  for(measure in specificity_measures$func_names){
    specificity_func <- specificity_measures$funcs[[measure]]
    if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
    if(specificity_measures$out_type[measure]=="matrix"){
      spec_mat_flat <- specificity_func(exp_mat, flat)
      spec_mat_weighted <- specificity_func(exp_mat, weights)
    } else if (specificity_measures$out_type[measure]=="vector"){
      spec_mat_flat <- as.matrix(specificity_func(exp_mat, flat))
      rownames(spec_mat_flat) <- rownames(exp_mat)
      spec_mat_weighted <- as.matrix(specificity_func(exp_mat, weights))
      rownames(spec_mat_weighted) <- rownames(exp_mat)
    } else { stop("WARNING: no out type associated with measure") }
    
    spec_mats[[paste("delta", measure,sep = "_")]] <- spec_mat_weighted - spec_mat_flat
    spec_mats[[paste("weighted", measure,sep = "_")]] <- spec_mat_weighted
    spec_mats[[paste("flat", measure,sep = "_")]] <- spec_mat_flat
  }
  
  ## look at top n and bottom n genes with greatest change in specificity score between weighted and flat
  gene_list <- list(up=list(),down=list()) 
  for(spec_mat_name in names(spec_mats)[grep("delta", names(spec_mats))] ){
    spec_mat <- spec_mats[[spec_mat_name]]
    measure <- str_split_fixed(spec_mat_name, "_", 2)[,2]
    ## for each tissue for matrix measures
    if(specificity_measures$out_type[measure]=="matrix"){
      n=5 ## top n for each tissue (column) of matrix
      temp_name <- measure
      gene_list$up[[temp_name]] <- data.frame("gene"=numeric())
      gene_list$down[[temp_name]] <- data.frame("gene"=numeric())
      ## go in same order as similarity tree
      for(i in get_leaves_attr(sim_tree, "label")){
        gene_list$up[[temp_name]] <- rbind(gene_list$up[[temp_name]],
                                           data.frame("gene"=rownames(spec_mat_flat)[order((spec_mat[,i]), decreasing = TRUE)[1:n]]))
      }
      for(i in get_leaves_attr(sim_tree, "label")){
        gene_list$down[[temp_name]] <- rbind(gene_list$down[[temp_name]],
                                           data.frame("gene" = rownames(spec_mat_flat)[order((spec_mat[,i]*(-1)), decreasing = TRUE)[1:n]])) 
        }
    } else if (specificity_measures$out_type[measure]=="vector"){
    } else { stop("WARNING: no out type associated with measure") }
    
  }
 
  ### gene lists generated now to organize into heatmaps 
    ## set up color functions for heatmaps
  col_funs <- list()
  for(type in c("delta","flat","weighted")){
    col_funs[[paste( type, "Zscore" ,sep="_")]] <-colorRamp2(c(min(c(-2, min(spec_mats[[paste( type, "Zscore" ,sep="_")]], na.rm = T)) ),
                                        -1.99, -1, 0, 1, 1.99,
                                        max(c(2, max( spec_mats$delta_Zscore, na.rm = T) ) )),
                                      c("darkblue", "blue", "#B8B8FF" ,"white", "#FFB8B8",  "red","red4"))
  }
  
  ## get genes in top and bottom (note: genes may be repeated since want to see top n for each group )
  gene_vec <- data.frame(gene = character())
  for(ud in names(gene_list)){
    for(measure in names(gene_list[[ud]])){
      gene_vec <- rbind(gene_vec, gene_list[[ud]][[measure]]) 
    }
  }
  
  heatmaps <- list()
  for(type in c("delta", "flat", "weighted")){
    heatmaps[[type]] <- HeatmapList()
    for(measure in names(gene_list[["up"]])){
      spec_mat_name <- names(spec_mats)[intersect(grep(measure, names(spec_mats)), grep(type, names(spec_mats)))]
      spec_mat <-  spec_mats[[spec_mat_name ]]
      temp_name <- paste(spec_mat_name, sep="_")
      if(measure=="Zscore"){
        h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
                      row_order = gene_vec$gene, show_row_names = FALSE,
                      cluster_columns = sim_tree,
                      column_order = colnames(spec_mat),
                      col = col_funs[[paste(type,measure, sep="_")]])  
      } else { 
        h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
                      row_order = gene_vec$gene, show_row_names = FALSE,
                      col = col_funs[[paste(type,measure, sep="_")]])
      }
      if(length(heatmaps[[type]])==0){
        heatmaps[[type]] <- h1
      } else {
        heatmaps[[type]] <- heatmaps[[type]] + h1
      }
    }
  }

heatmaps$delta
heatmaps$flat + heatmaps$weighted
}


head(gtex_rowinfo)
#>                Name Description
#> 1 ENSG00000223972.5     DDX11L1
#> 2 ENSG00000227232.5      WASH7P
#> 3 ENSG00000278267.1   MIR6859-1
#> 4 ENSG00000243485.5 MIR1302-2HG
#> 5 ENSG00000237613.2     FAM138A
#> 6 ENSG00000268020.3      OR4G4P

specific_genes <- c( "OLIG1","OLIG2",
                     "PRSS1", "PTF1A", 
                     "MYH6", "MYL4")
spec_genes_w_ids <- gtex_rowinfo[which( is.element(gtex_rowinfo$Description, specific_genes)),]
spec_genes_w_ids <- spec_genes_w_ids[which(is.element(spec_genes_w_ids$Name, rownames(spec_mats$delta_Zscore))),]
spec_genes_w_ids <- spec_genes_w_ids[match(specific_genes, spec_genes_w_ids$Description),]
spec_genes_w_ids <- merge( data.frame(f_or_w = c("flat","weighted")),spec_genes_w_ids)
spec_genes_w_ids <- spec_genes_w_ids[complete.cases(spec_genes_w_ids),]
 

col_funs <- list()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
  if(measure=="Zscore"){
    col_funs[[measure]] <-colorRamp2( c( min(c(min(spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE ),
                                             min(spec_mats$flat_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),-4)),
                                    #   -2, -1.99, 0, 1.99, 2, 3.99 ,
                                      -3, -2, 0, 2, 3, 3.99,
                                       max(c(max( spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE ), 
                                             max( spec_mats$flat_Zscore[spec_genes_w_ids$Name,]), na.rm = TRUE),4)),
                                     c("darkblue", "blue", "#aaaaff" ,"white", "#ffaaaa",  "red","red4", "grey20"))
                                    #c("darkblue", "blue", "#aaaaff" ,"white", "#ffaaaa",  "red","red4"))
  } else {
    col_funs[[measure]] <-colorRamp2(c( 0, 0.4, 0.6, 0.8, 1),
                                     c("white", "lightgoldenrod1","yellow" ,"red","red4"))
    #col_funs[[measure]] <-colorRamp2(c( 0, 0.5, 1),
    #                                 c("white","red","red4"))
  }
}


heatmaps <- HeatmapList()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
  temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
  colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
  rownames(temp_mat) <- spec_genes_w_ids$Name
  for(i in 1:nrow(spec_genes_w_ids)){
    temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
  }
  if(measure=="Zscore"){
    split <- rep(1:(nrow(spec_genes_w_ids)/2), each=2)
    ra <- rowAnnotation(foo = anno_block(labels =  paste(unique(spec_genes_w_ids$Description), "\nW     F")))
    h1 <- Heatmap(temp_mat, name=measure,
                  cluster_rows = FALSE, show_row_names = FALSE,
                  cluster_columns = sim_tree,
                  column_order = colnames(spec_mats$delta_Zscore),
                  row_split = split,
                  left_annotation = ra,
                  col = col_funs[[measure]] 
                  )
    } else {
      # h1 <- Heatmap(temp_mat, name=measure,
      #               cluster_rows = FALSE, show_row_names = TRUE,
      #               col = col_funs[[measure]])
      }
  if(length(heatmaps)==0){
      heatmaps <- h1
      # } else {
      # heatmaps <- heatmaps + h1
    }
}
draw(heatmaps)




## get particular values reported in paper
measure="Zscore"
temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
rownames(temp_mat) <- spec_genes_w_ids$Name
for(i in 1:nrow(spec_genes_w_ids)){
  temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
}
rownames(temp_mat) <- paste(c("flat","weighted"), spec_genes_w_ids$Description)
## gene ontology summaries
library(DOSE)
library(pathview)
library(clusterProfiler)
library(org.Hs.eg.db)

library(tidyverse)
library(topGO) 
library(enrichplot)

spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[1:length(flat)] <- 1
spec_mat_flat  <- calc_weighted_zscore_matrix(exp_mat, flat)

cutoff <- 2
which <- which((spec_mat_weighted >= cutoff & spec_mat_flat < cutoff))

allOE_genes<-str_split_fixed(rownames(spec_mat_flat),"[.]",2)[,1]

sigOE_genes <- character()
for(j in 1:ncol(spec_mat_weighted)){
 which<-which((spec_mat_flat[,j]<=2.0 & spec_mat_weighted[,j]>=2.0))
which<-as.matrix(which)
  which<- rownames(which)
  which<-str_split_fixed(which, "[.]", 2)
  which<- which[,1]
   sigOE_genes <- c(sigOE_genes, which)
}
sigOE_genes<-unique(sigOE_genes)


for(onto in c("BP")){ #},"MF","CC")){
  print(onto)
  ego <- enrichGO(gene = sigOE_genes, 
                  universe = allOE_genes,
                  keyType = "ENSEMBL",
                  OrgDb = org.Hs.eg.db, 
                  ont = onto, 
                  pAdjustMethod = "BH",
                  qvalueCutoff = 0.05,
                  readable = TRUE,
                  pool=TRUE)

  n=10
  print(dotplot(ego, showCategory = n))
  
  ego <- enrichplot::pairwise_termsim(ego)
  print(emapplot(ego, showCategory = n))
  
}
#> [1] "BP"
#> wrong orderBy parameter; set to default `orderBy = "x"`

cluster_summary <- data.frame(ego)
cluster_summary$frac_terms <- as.numeric(str_split_fixed(cluster_summary$GeneRatio,"/",2)[,1] ) / as.numeric( str_split_fixed(cluster_summary$BgRatio,"/",2)[,1] )


## only 1 similarity function tested for now, can make as list later
similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}

## only 1 clustering fucntion tested for now, can make as a list later
cluster_func <- function(sim_mat, method="single"){
  add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = method) ) ))
}

sim_mat <- similarity_func(exp_mat)
cluster_methods <- c("single","complete","average")
for(cm in cluster_methods){
  par(mar = c(16,2,2,2))
  sim_tree <- cluster_func(sim_mat = sim_mat, method = cm)
  sim_tree %>% plot(main=cm)
}

### to do: justify method for measuring sample similarity - may require comparison or refs
calc_dot_product_similarity_matrix <- function(dat) {
  dot_product_similarity_matrix <- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(dot_product_similarity_matrix) <- colnames(dat)
  rownames(dot_product_similarity_matrix) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)){
      which_i <- which(!is.na(dat[,i])) ## ignore NAs
      which_j <- which(!is.na(dat[,j])) ## ignore NAs
      dot_product_similarity_matrix[i,j] <- sum(dat[which_i,i] * dat[which_j,j]) / (norm(dat[which_i,i],"2")*norm(dat[which_j,j],"2"))
    }
  }
  return(dot_product_similarity_matrix)
}
sim_mat<- (similarity_func(exp_mat))
sim_mat <- (sim_mat-min(sim_mat,na.rm=T)); sim_mat <- sim_mat/max(sim_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(sim_mat)
heatmap(sim_mat, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))



####ADDED
calc_eucl_matrix<- function(dat) {
  eucl<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(eucl) <- colnames(dat)
  rownames(eucl) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
       eucl[i,j]<-sqrt(sum((dat[,i]-dat[,j])^2,na.rm = T ) )
    }
  }
  return(eucl)
}
eucl_matrix<- 1-(calc_eucl_matrix(calc_zscore_matrix(exp_mat)))
eucl_matrix <- (eucl_matrix-min(eucl_matrix,na.rm=T)); eucl_matrix <- eucl_matrix/max(eucl_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(eucl_matrix)
heatmap(eucl_matrix, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))



####ADDED
calc_manhattan_matrix<- function(dat) {
  manhatt<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(manhatt) <- colnames(dat)
  rownames(manhatt) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
       manhatt[i,j]<-sum(abs(dat[,i]-dat[,j]),na.rm = T)
    }
  }
  for(i in 1:nrow(manhatt)){
    manhatt[,i] <- manhatt[,i] / max(manhatt,na.rm=T )
  }  
  return(manhatt)
}
manhat_matrix<- 1-(calc_manhattan_matrix(calc_zscore_matrix(exp_mat)))
manhat_matrix <- (manhat_matrix-min(manhat_matrix,na.rm=T)); manhat_matrix <- manhat_matrix/max(manhat_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(manhat_matrix)
heatmap(manhat_matrix, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))



calc_canberra_matrix<- function(dat) {
  canberra<- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(canberra) <- colnames(dat)
  rownames(canberra) <- colnames(dat)
  for(i in 1:ncol(dat)){
    for(j in 1:ncol(dat)) {
      canberra[i,j]<- sum(abs(dat[,i] - dat[,j])/ (abs(dat[,i])+abs(dat[,j])) , na.rm = T) 
    }
  }
  for(i in 1:nrow(canberra)){
    canberra[,i] <- canberra[,i] / max(canberra,na.rm=T )
  }  
  return(canberra)
}
canberra_mat<- 1-(calc_canberra_matrix( calc_zscore_matrix(exp_mat)))
canberra_mat <- (canberra_mat-min(canberra_mat,na.rm=T)); canberra_mat <- canberra_mat/max(canberra_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(canberra_mat)
heatmap(canberra_mat, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))

>>>>>>> 5239ab83e72d30aec9ac26c46b366f3acb325443